# Load the full Open Food Facts export (tab-separated).
# skip_blank_lines is deliberately not passed: it already defaults to True.
df = pd.read_csv(
    'en.openfoodfacts.org.products.csv',
    sep='\t',
)

# NOTE(review): clean.vm is a project helper not visible here; presumably it
# reports or filters missing values using the 0.001 threshold -- confirm.
clean.vm(df, 0.001)
# Columns of interest (18 in total).
ls = [
    'code',
    'url',
    'product_name',
    'brands',
    'countries_en',
    'pnns_groups_1',
    'product_quantity',
    'ingredients_text',
    'sugars_100g',
    'saturated-fat_100g',
    'fiber_100g',
    'salt_100g',
    'additives_n',
    'energy-kcal_100g',
    'carbohydrates_100g',
    'proteins_100g',
    'nova_group',
    'nutriscore_grade',
]

# Re-read only those columns. Indexing with [ls] afterwards puts the columns
# in the order of the list.
df = pd.read_csv(
    'en.openfoodfacts.org.products.csv',
    sep='\t',
    skip_blank_lines=True,
    usecols=ls,
    dtype={
        # Barcodes are identifiers, not numbers: reading them as text keeps
        # leading zeros and allows the prefix comparisons done later on.
        'code': str,
        'nova_group': 'category',
        'nutriscore_grade': 'category',
    },
)[ls]

# Save the reduced dataframe as a .csv file.
df.to_csv(
    '/Users/JoycyRobert/Documents/JUPYTER/OPC/PROJET3/P3/df_export.csv',
    index=False,
)
# Rank the PNNS food groups by number of products and store the resulting
# rank in a new 'GROUPE' column.
PNNS_ = pd.DataFrame()
PNNS_['GROUPE'] = df['pnns_groups_1'].unique()
# Discard the first distinct value (presumably the missing/unknown
# placeholder -- TODO confirm against the data).
PNNS_ = PNNS_.drop(0)
PNNS_ = PNNS_.reset_index(drop=True)

# Name both columns explicitly: the labels produced by
# value_counts().reset_index() differ between pandas 1.x and 2.x, and the
# merge below relies on 'index' (group) and 'pnns_groups_1' (count).
col = (
    df['pnns_groups_1']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='pnns_groups_1')
)
PNNS_ = PNNS_.merge(col, left_on='GROUPE', right_on='index')
PNNS_ = PNNS_.drop(columns='index')
PNNS_ = PNNS_.sort_values('pnns_groups_1', ascending=False)
PNNS_ = PNNS_.drop(columns='pnns_groups_1')

# Rank 1 goes to the most frequent group. The length follows the number of
# groups actually present instead of assuming exactly nine.
classement = np.arange(1, len(PNNS_) + 1)
PNNS_['#REF'] = classement
PNNS_dic_groupe = PNNS_.pivot_table(index='GROUPE').to_dict()

# Group -> rank mapping, copied/pasted (presumably from the PNNS_dic_groupe
# output above -- confirm).
dic_groupe = {
    'Boissons sans sucres': 6,
    'Collation salée': 9,
    'Céréales': 4,
    'Fruits et légumes': 7,
    'Matières grasses': 5,
    'Plats cuisinés': 8,
    'Produits Laitiers': 3,
    'Sucreries': 1,
    'Viandes': 2,
}

# Values absent from dic_groupe are left untouched by replace().
df['GROUPE'] = df['pnns_groups_1'].replace(dic_groupe)
df['pnns_groups_1'] = df['pnns_groups_1'].astype('category')
# Inconclusive attempt at checking the product URLs: some links lead to a
# 'Bar code introuvable' ("bar code not found") error, which the validator
# cannot detect.
import validators

# TEST_URL holds, for every row, whatever validators.url() returns for the
# row's URL. Series.map replaces the former row-by-row df.loc assignment.
df['TEST_URL'] = df['URL'].map(validators.url)
# Reload the exported dataset. CODE is read as text and the three grouping
# columns as categoricals.
df = pd.read_csv(
    'df_export2.csv',
    dtype={
        'CODE': 'object',
        'PNNS_GROUPS_1': 'category',
        'NOVA_GROUP': 'category',
        'NUTRISCORE_GRADE': 'category',
    },
)
# LA VIE CLAIRE case: fill in the missing brand of products whose barcode
# starts with '32661911' (e.g. 3266191107608).
# Rows with a missing CODE compare as False and are therefore excluded.
la_vie_claire = df['CODE'].str[:8] == '32661911'

# Summary of the brands currently recorded for that barcode prefix.
print(df.loc[la_vie_claire, 'BRANDS'].describe())

# Bar chart of those brands; the count axis is logarithmic.
fig, ax = plt.subplots(figsize=(8, 8))
sns.set()
# Pass the series as x= (a positional argument is interpreted as `data` by
# recent seaborn releases) and draw on the axes created above.
sns.countplot(x=df.loc[la_vie_claire, 'BRANDS'], ax=ax)
ax.set_xlabel('')
plt.yscale('log')
ax.set_title('Cas de LA VIE CLAIRE')
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
plt.savefig('P3_B_LAVIEC.png', dpi=150, bbox_inches='tight')

# Products with that prefix and no brand, and the number of distinct brands
# before the fix.
missing_brand = la_vie_claire & df['BRANDS'].isna()
print(df[missing_brand])
print(df['BRANDS'].unique().shape)

df.loc[missing_brand, 'BRANDS'] = 'LA VIE CLAIRE'
# Fill each missing quantitative value with the median of that variable over
# all products whose name contains the same leading word(s): the first word
# for one-word names, the first two words otherwise.
# NOTE(review): every missing cell triggers a scan of the whole PRODUCT_NAME
# column, and medians are computed on the partially imputed frame; kept
# as is so that results do not change.
for var in var_quant:
    print(var)
    ind_vm = df.index[df[var].isna()]
    for i in ind_vm:
        name = df.loc[i, 'PRODUCT_NAME']
        if not isinstance(name, str):
            # No product name to match on: leave the value missing.
            continue
        words = name.split(' ')
        key = words[0] if len(words) == 1 else words[0] + ' ' + words[1]
        # na=False: rows without a name must count as "no match"; a NaN in
        # the mask cannot be used for boolean indexing.
        same_name = df['PRODUCT_NAME'].str.contains(key, regex=False, na=False)
        var_med = df.loc[same_name, var].median()
        df.loc[i, var] = var_med

import re

# Replace every punctuation character with a space. re.escape keeps
# characters that are special inside a character class (']', '\', '^', '-')
# literal, so all of them are matched.
# NOTE(review): `punctuation` is defined elsewhere, presumably
# string.punctuation -- confirm.
punct_re = re.compile('[' + re.escape(punctuation) + ']')
text_data.Text = [punct_re.sub(' ', sent) for sent in text_data.Text]
for sent in text_data.Text:
    print(sent)